Import necessary libraries
library(dplyr)
library(funModeling)
library(MASS)
library(gridExtra)
library(tidyverse)
library(corrplot)
library(Hmisc)
library(survival)
library(survminer)
library(ggplot2)
library(ggpubr)
library(magrittr)
library(knitr)
library(rms)
library(foreign)
library(pROC)
library(timeROC)
Import and clean the dataset
# read data file and check if there is a missing value
surv_data = read_csv("Project_2_data.csv")|>
janitor::clean_names() |>
rename(regional_node_positive = reginol_node_positive) |>
mutate(
positive_ratio = regional_node_positive/regional_node_examined,
status = ifelse(status == "Alive", 0, 1)
) |>
mutate(
grade = case_when(
grade == "1" ~ 1,
grade == "2" ~ 2,
grade == "3" ~ 3,
grade == "anaplastic; Grade IV" ~ 4)) |>
mutate(across(where(is.character), as.factor))|>
relocate(status, survival_months, everything())
any(is.na(surv_data)) # FALSE
## [1] FALSE
summary(surv_data) |>
knitr::kable()
| status | survival_months | age | race | marital_status | t_stage | n_stage | x6th_stage | differentiate | grade | a_stage | tumor_size | estrogen_status | progesterone_status | regional_node_examined | regional_node_positive | positive_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :0.0000 | Min. : 1.0 | Min. :30.00 | Black: 291 | Divorced : 486 | T1:1603 | N1:2732 | IIA :1305 | Moderately differentiated:2351 | Min. :1.000 | Distant : 92 | Min. : 1.00 | Negative: 269 | Negative: 698 | Min. : 1.00 | Min. : 1.000 | Min. :0.02041 | |
| 1st Qu.:0.0000 | 1st Qu.: 56.0 | 1st Qu.:47.00 | Other: 320 | Married :2643 | T2:1786 | N2: 820 | IIB :1130 | Poorly differentiated :1111 | 1st Qu.:2.000 | Regional:3932 | 1st Qu.: 16.00 | Positive:3755 | Positive:3326 | 1st Qu.: 9.00 | 1st Qu.: 1.000 | 1st Qu.:0.10345 | |
| Median :0.0000 | Median : 73.0 | Median :54.00 | White:3413 | Separated: 45 | T3: 533 | N3: 472 | IIIA:1050 | Undifferentiated : 19 | Median :2.000 | NA | Median : 25.00 | NA | NA | Median :14.00 | Median : 2.000 | Median :0.21429 | |
| Mean :0.1531 | Mean : 71.3 | Mean :53.97 | NA | Single : 615 | T4: 102 | NA | IIIB: 67 | Well differentiated : 543 | Mean :2.151 | NA | Mean : 30.47 | NA | NA | Mean :14.36 | Mean : 4.158 | Mean :0.32647 | |
| 3rd Qu.:0.0000 | 3rd Qu.: 90.0 | 3rd Qu.:61.00 | NA | Widowed : 235 | NA | NA | IIIC: 472 | NA | 3rd Qu.:3.000 | NA | 3rd Qu.: 38.00 | NA | NA | 3rd Qu.:19.00 | 3rd Qu.: 5.000 | 3rd Qu.:0.50000 | |
| Max. :1.0000 | Max. :107.0 | Max. :69.00 | NA | NA | NA | NA | NA | NA | Max. :4.000 | NA | Max. :140.00 | NA | NA | Max. :61.00 | Max. :46.000 | Max. :1.00000 |
Checking the correlation between numerical variables
cor_matrix = surv_data |>
select_if(is.numeric) |>
cor()
corrplot::corrplot(cor_matrix,
type = "upper",
diag = FALSE,
addCoef.col = "black",
tl.col = "black",
tl.srt = 45)
The correlation matrix shows that the numerical variables in the dataset
have low to moderate correlations with each other. The relatively higher
correlation among
reginal_node_positive,
reginal_node_examined, and positive_ratio is
expected. In the later analysis, we will only consider positive ratio
for model building.
Check the correlation between categorical variables
categorical_vars <- surv_data |>
select_if(is.factor)
association_test <- function(var1, var2) {
cont_table <- table(var1, var2)
chi_test <- chisq.test(cont_table)
n <- sum(cont_table)
min_dim <- min(dim(cont_table)) - 1
cramer_v <- sqrt(chi_test$statistic / (n * min_dim))
return(list(
chi_square = chi_test$statistic,
p_value = chi_test$p.value,
cramer_v = cramer_v
))
}
results <- list()
var_names <- names(categorical_vars)
for(i in 1:(length(var_names)-1)) {
for(j in (i+1):length(var_names)) {
var1 <- var_names[i]
var2 <- var_names[j]
test_result <- association_test(
categorical_vars[[var1]],
categorical_vars[[var2]]
)
results[[paste(var1, var2, sep="-")]] <- c(
test_result,
list(var1 = var1, var2 = var2)
)
}
}
results_df <- do.call(rbind, lapply(results, function(x) {
data.frame(
Variable1 = x$var1,
Variable2 = x$var2,
Chi_Square = x$chi_square,
P_Value = x$p_value,
Cramers_V = x$cramer_v
)
})) |>
arrange(P_Value)
kable(results_df,
digits = 4,
col.names = c("var1", "var2", "chi-statistic", "P-value", "Cramer's V"))
| var1 | var2 | chi-statistic | P-value | Cramer’s V | |
|---|---|---|---|---|---|
| t_stage-x6th_stage | t_stage | x6th_stage | 6784.0786 | 0.0000 | 0.7496 |
| n_stage-x6th_stage | n_stage | x6th_stage | 6686.8341 | 0.0000 | 0.9115 |
| estrogen_status-progesterone_status | estrogen_status | progesterone_status | 1054.8431 | 0.0000 | 0.5120 |
| x6th_stage-a_stage | x6th_stage | a_stage | 729.1926 | 0.0000 | 0.4257 |
| t_stage-a_stage | t_stage | a_stage | 583.2589 | 0.0000 | 0.3807 |
| n_stage-a_stage | n_stage | a_stage | 355.5764 | 0.0000 | 0.2973 |
| t_stage-n_stage | t_stage | n_stage | 323.4137 | 0.0000 | 0.2005 |
| differentiate-estrogen_status | differentiate | estrogen_status | 217.4167 | 0.0000 | 0.2324 |
| differentiate-progesterone_status | differentiate | progesterone_status | 147.8031 | 0.0000 | 0.1917 |
| x6th_stage-differentiate | x6th_stage | differentiate | 158.0412 | 0.0000 | 0.1144 |
| race-marital_status | race | marital_status | 137.9574 | 0.0000 | 0.1309 |
| n_stage-differentiate | n_stage | differentiate | 115.5011 | 0.0000 | 0.1198 |
| t_stage-differentiate | t_stage | differentiate | 90.9569 | 0.0000 | 0.0868 |
| x6th_stage-estrogen_status | x6th_stage | estrogen_status | 52.0046 | 0.0000 | 0.1137 |
| n_stage-estrogen_status | n_stage | estrogen_status | 42.5231 | 0.0000 | 0.1028 |
| n_stage-progesterone_status | n_stage | progesterone_status | 36.8460 | 0.0000 | 0.0957 |
| x6th_stage-progesterone_status | x6th_stage | progesterone_status | 42.4854 | 0.0000 | 0.1028 |
| a_stage-estrogen_status | a_stage | estrogen_status | 15.5892 | 0.0001 | 0.0622 |
| race-differentiate | race | differentiate | 27.9028 | 0.0001 | 0.0589 |
| t_stage-estrogen_status | t_stage | estrogen_status | 19.5499 | 0.0002 | 0.0697 |
| race-estrogen_status | race | estrogen_status | 13.4090 | 0.0012 | 0.0577 |
| t_stage-progesterone_status | t_stage | progesterone_status | 13.8082 | 0.0032 | 0.0586 |
| marital_status-n_stage | marital_status | n_stage | 22.3525 | 0.0043 | 0.0527 |
| differentiate-a_stage | differentiate | a_stage | 10.5771 | 0.0142 | 0.0513 |
| marital_status-progesterone_status | marital_status | progesterone_status | 11.0469 | 0.0260 | 0.0524 |
| marital_status-x6th_stage | marital_status | x6th_stage | 28.1080 | 0.0307 | 0.0418 |
| race-progesterone_status | race | progesterone_status | 5.0431 | 0.0803 | 0.0354 |
| marital_status-differentiate | marital_status | differentiate | 19.1556 | 0.0848 | 0.0398 |
| marital_status-estrogen_status | marital_status | estrogen_status | 7.6903 | 0.1036 | 0.0437 |
| marital_status-a_stage | marital_status | a_stage | 7.6585 | 0.1049 | 0.0436 |
| a_stage-progesterone_status | a_stage | progesterone_status | 2.3828 | 0.1227 | 0.0243 |
| marital_status-t_stage | marital_status | t_stage | 17.1204 | 0.1451 | 0.0377 |
| race-n_stage | race | n_stage | 6.0797 | 0.1933 | 0.0275 |
| race-t_stage | race | t_stage | 8.4624 | 0.2061 | 0.0324 |
| race-x6th_stage | race | x6th_stage | 8.8396 | 0.3560 | 0.0331 |
| race-a_stage | race | a_stage | 0.3070 | 0.8577 | 0.0087 |
The correlation analysis of categorical variables in this breast cancer dataset reveals several key patterns. The strongest associations are found between staging-related variables, with differentiate-grade showing perfect correlation (Cramer’s V = 1.0) and n_stage-x6th_stage displaying very strong correlation (Cramer’s V = 0.91). A notable moderate correlation exists between hormone receptor statuses (estrogen_status-progesterone_status, Cramer’s V = 0.51), which aligns with biological understanding. While many other associations are statistically significant (p < 0.05), their weaker Cramer’s V values (< 0.3) suggest limited practical significance.
Check the distribution of numerical variables
surv_data |>
funModeling::plot_num()
Box-Cox transformation
surv_data = surv_data |>
mutate(
positive_ratio_transformed = {
bc = boxcox(positive_ratio ~ 1, plotit = FALSE)
lambda = bc$x[which.max(bc$y)]
if(abs(lambda) < 1e-4) {
log(positive_ratio)
} else {
(positive_ratio^lambda - 1) / lambda
}
}
)
p1 <- ggplot(surv_data, aes(x = positive_ratio)) +
geom_histogram(fill = "skyblue", bins = 30) +
labs(title = "Original Distribution",
x = "Positive Ratio")
p2 <- ggplot(surv_data, aes(x = positive_ratio_transformed)) +
geom_histogram(fill = "lightgreen", bins = 30) +
labs(title = "Transformed Distribution",
x = "Transformed Positive Ratio")
grid.arrange(p1, p2, ncol = 2)
## Model Building
survfit(Surv(survival_months,status)~race,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0", "#3291ca"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("White", "Black", "Other" ),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
survfit(Surv(survival_months,status)~marital_status,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500", "#FF0000"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("Married", "Single", "Separated", "Divored", "Widowed"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
survfit(Surv(survival_months,status)~t_stage,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500", "#FF0000"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("T1", "T2", "T3", "T4"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
survfit(Surv(survival_months,status)~n_stage,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0", "#3291ca"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("N1", "N2", "N3"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
survfit(Surv(survival_months,status)~x6th_stage,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500", "#FF0000"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("IIA", "IIB", "IIIA","IIIB", "IIIC"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
survfit(Surv(survival_months,status)~differentiate,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("Moderately differentiated", "Poorly differentiated", "Well differentiated", "Undifferentiated"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
survfit(Surv(survival_months,status)~grade,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("1", "2", "3", "4"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
survfit(Surv(survival_months,status)~a_stage,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("Distant","Regional"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
survfit(Surv(survival_months,status)~estrogen_status,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("Positive","Negative"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
survfit(Surv(survival_months,status)~progesterone_status,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("Positive","Negative"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
surv_data = surv_data |>
mutate(age_group = ifelse(age > mean(age, na.rm = TRUE), "Elder", "Young"))
survfit(Surv(survival_months,status)~age_group,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("Young","Elder"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
surv_data <- surv_data |>
mutate(tumor_size_group = case_when(
tumor_size < 40 ~ "Small (<40)",
tumor_size >= 40 & tumor_size < 80 ~ "Medium (40-80)",
tumor_size >= 80 & tumor_size < 120 ~ "Large (80-120)",
tumor_size >= 120 ~ "Very Large (>120)"
))
survfit(Surv(survival_months,status)~tumor_size_group,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("Small (<40)", "Medium (40-80)", "Large (80-120)", "Very Large (>120)"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
surv_data <- surv_data |>
mutate(positive_ratio_group = case_when(
positive_ratio <= 0.25 ~ "Low (0.02-0.25)",
positive_ratio > 0.25 & positive_ratio <= 0.5 ~ "Moderate (0.25-0.5)",
positive_ratio > 0.5 & positive_ratio <= 0.75 ~ "High (0.5-0.75)",
positive_ratio > 0.75 ~ "Very High (0.75-1)"
))
survfit(Surv(survival_months,status)~positive_ratio_group,data = surv_data) |>
ggsurvplot(
size = 1,
cex.lab= 2,
break.time.by = 6,
xlim = c(0,107),
axis.title.x =element_text(size=5),
axis.title.y = element_text(size=5),
palette = c("#54A136", "#C757A0", "#3291ca", "#FFA500"),
conf.int = TRUE,
pval = TRUE,
risk.table = TRUE,
xlab = "Follow-up months",
ylab="Survival probability ",
risk.table.col = "strata",
risk.table.fontsize = 3,
legend.labs = c("Low (0.02-0.25)", "Moderate (0.25-0.5)", "High (0.5-0.75)", "Very High (0.75-1)"),
risk.table.height = 0.3,
ggtheme = theme_bw()
)
- All the Kaplan-Meier survival analyses show significant differences in
survival probabilities among different groups of each variable.[
Log-rank test: p-value < 0.05]
\(h(t) = h_0(t) \exp(\beta_1 X)\)
surv_object <- with(surv_data, Surv(survival_months, status))
result_unicox <- data.frame("Variable" = character(),
"Hazard Ratio" = numeric(),
"95%CI" = character(),
"P value" = numeric())
for (variable in c("age", "race", "marital_status", "t_stage", "n_stage", "x6th_stage", "differentiate", "grade", "a_stage", "tumor_size", "estrogen_status", "progesterone_status", "positive_ratio_transformed")) {
cox_formula <- as.formula(paste("surv_object ~", variable))
cox_model <- coxph(cox_formula, data = surv_data)
variable_name <- variable
hazard_ratio <- exp(coef(cox_model))
ci_lower <- exp(confint(cox_model))[1]
ci_upper <- exp(confint(cox_model))[2]
p_value <- summary(cox_model)$coefficients[5]
new_row <- data.frame("Variable" = variable_name,
"Hazard Ratio" = hazard_ratio,
"95% CI" = paste0("(", round(ci_lower, 2), "-", round(ci_upper, 2), ")"),
"P value" = p_value,
stringsAsFactors = FALSE)
result_unicox <- rbind(result_unicox, new_row)
}
result_unicox = result_unicox |>
arrange(desc(`P.value`))
knitr::kable(result_unicox,
col.names = c("Variable", "Hazard Ratio", "95% CI", "P value"))
| Variable | Hazard Ratio | 95% CI | P value | |
|---|---|---|---|---|
| differentiatePoorly differentiated | differentiate | 1.9333973 | (1.64-2.14) | 4.1456318 |
| differentiateUndifferentiated | differentiate | 4.1456318 | (1.64-2.14) | 4.1456318 |
| differentiateWell differentiated | differentiate | 0.5472506 | (1.64-2.14) | 4.1456318 |
| t_stageT2 | t_stage | 1.8304505 | (1.51-1.89) | 2.4047618 |
| t_stageT3 | t_stage | 2.4047618 | (1.51-1.89) | 2.4047618 |
| t_stageT4 | t_stage | 4.6187727 | (1.51-1.89) | 2.4047618 |
| x6th_stageIIB | x6th_stage | 1.6858472 | (1.3-2) | 1.6858472 |
| x6th_stageIIIA | x6th_stage | 2.5593598 | (1.3-2) | 1.6858472 |
| x6th_stageIIIB | x6th_stage | 4.4634086 | (1.3-2) | 1.6858472 |
| x6th_stageIIIC | x6th_stage | 6.3473591 | (1.3-2) | 1.6858472 |
| marital_statusMarried | marital_status | 0.7155876 | (0.57-1.22) | 0.7155876 |
| marital_statusSeparated | marital_status | 2.1134070 | (0.57-1.22) | 0.7155876 |
| marital_statusSingle | marital_status | 0.9140521 | (0.57-1.22) | 0.7155876 |
| marital_statusWidowed | marital_status | 1.1476127 | (0.57-1.22) | 0.7155876 |
| raceOther | race | 0.3686767 | (0.24-0.43) | 0.2098027 |
| raceWhite | race | 0.5489746 | (0.24-0.43) | 0.2098027 |
| n_stageN2 | n_stage | 2.1583532 | (1.78-3.83) | 0.0988284 |
| n_stageN3 | n_stage | 4.6243027 | (1.78-3.83) | 0.0988284 |
| age | age | 1.0157825 | (1.01-1.03) | 0.0007084 |
| a_stageRegional | a_stage | 0.3198659 | (0.23-0.45) | 0.0000000 |
| tumor_size | tumor_size | 1.0134463 | (1.01-1.02) | 0.0000000 |
| grade | grade | 1.9206998 | (1.69-2.18) | 0.0000000 |
| progesterone_statusPositive | progesterone_status | 0.3842797 | (0.32-0.45) | 0.0000000 |
| estrogen_statusPositive | estrogen_status | 0.2726864 | (0.22-0.34) | 0.0000000 |
| positive_ratio_transformed | positive_ratio_transformed | 1.8948182 | (1.73-2.08) | 0.0000000 |
According to the univariate Cox regression analysis, the variables
age, a_stage, tumor_size,
grade, progesterone_status,
estrogen_status, and positive_ratioare
significantly associated with the survival outcome (p < 0.05).
(Explain Hazard ratio… continuous/ categorical)
result_use = result_unicox |>
mutate(
significant = ifelse(result_unicox$P.value < 0.05,"significant","Not significant"))
multi_cox <- result_use[result_use$significant == "significant",]$Variable
formula <- as.formula(paste("Surv(survival_months, status) ~", paste(multi_cox, collapse = " + ")))
multi_cox_model <- coxph(formula, data = surv_data)
data_use <- summary(multi_cox_model)
multi_cox_HR <- round(data_use$coefficients[,2],2)
multi_cox_CI2.5 <- round(data_use$conf.int[,3],2)
multi_cox_CI97.5 <- mul_CI95<-round(data_use$conf.int[,4],2)
multi_cox_CI <- paste0('(',multi_cox_CI2.5,'-',multi_cox_CI97.5,')')
multi_cox_P_value <- round(data_use$coefficients[,5],3)
Variable <- row.names(data.frame(data_use$coefficients))
multi_cox_result<- data.frame(Variable,multi_cox_HR,multi_cox_CI2.5,multi_cox_CI97.5,multi_cox_CI,multi_cox_P_value)
knitr::kable(multi_cox_result)
| Variable | multi_cox_HR | multi_cox_CI2.5 | multi_cox_CI97.5 | multi_cox_CI | multi_cox_P_value | |
|---|---|---|---|---|---|---|
| age | age | 1.02 | 1.01 | 1.03 | (1.01-1.03) | 0.000 |
| a_stageRegional | a_stageRegional | 0.60 | 0.43 | 0.86 | (0.43-0.86) | 0.005 |
| tumor_size | tumor_size | 1.01 | 1.00 | 1.01 | (1-1.01) | 0.000 |
| grade | grade | 1.58 | 1.38 | 1.80 | (1.38-1.8) | 0.000 |
| progesterone_statusPositive | progesterone_statusPositive | 0.61 | 0.50 | 0.76 | (0.5-0.76) | 0.000 |
| estrogen_statusPositive | estrogen_statusPositive | 0.49 | 0.38 | 0.64 | (0.38-0.64) | 0.000 |
| positive_ratio_transformed | positive_ratio_transformed | 1.70 | 1.55 | 1.87 | (1.55-1.87) | 0.000 |
ggplot(multi_cox_result, aes(multi_cox_HR, Variable)) +
geom_vline(xintercept = 1,
linetype = "dashed",
size = 1) +
geom_errorbar(aes(xmin = multi_cox_CI2.5, xmax = multi_cox_CI97.5),width = 0.1) +
geom_point(aes(color = multi_cox_P_value),size = 5, shape = 18) +
scale_color_continuous(low = 'skyblue', high = 'red') +
labs(x = 'Hazard ratio', title = 'Forest plot for multivariate cox regression') +
theme_pubr() +
theme(legend.position = 'right')
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
According to the multivariate Cox regression analysis, the variables
age, a_stage, tumor_size,
grade, progesterone_status,
estrogen_status, and positive_ratio are
significantly associated with the survival outcome (p < 0.05).
dd <- datadist(surv_data)
options(datadist = "dd")
surv_data = surv_data %>%
mutate(survival_years = survival_months / 12)
surv_object <- with(surv_data, Surv(survival_years, status == 1))
final_model <- cph(surv_object ~ age + a_stage + tumor_size + grade + progesterone_status + estrogen_status + positive_ratio,
x = TRUE ,y = TRUE, surv = TRUE, data = surv_data)
surv = Survival(final_model)
surv_1y <- function(x)surv(1,lp=x)
surv_4y <- function(x)surv(4,lp=x)
surv_8y <- function(x)surv(8,lp=x)
Nomogram_1 <- nomogram(final_model,fun = list(surv_1y,surv_4y,surv_8y),lp=F,
funlabel = c('1 year survival rate','4 year survival rate','8 year survival rate'),
maxscale = 100,fun.at = c(0.1,seq(0.1,0.9,by=0.1),0.90))
plot(Nomogram_1,
cex.axis = 0.4,
cex.var = 0.8,
cex = 0.8,
lmgp = 0.3, # 减小标签与轴线距离
label.offset = 0.2, # 增加标签偏移量
points.label = "Points",
total.points.label = "Total Points",
col.grid = gray(c(0.8, 0.95)),
labels.right = FALSE,
width.max = 1000,
height.max = 800,
mar = c(4, 4, 4, 4) # 增加边距
)
## ROC curve
pred <- predict(final_model, type="lp")
ROC_table <- data.frame(time = surv_data[,"survival_years"], status = surv_data[,"status"], score = pred)
time_roc_res <- timeROC(T = ROC_table$survival_years,
delta = ROC_table$status,
marker = ROC_table$score,
cause = 1,
weighting="marginal",
times = c(1, 4, 8),
ROC = TRUE,
iid = TRUE
)
time_ROC_df <- data.frame(TP_1year = time_roc_res$TP[, 1],
FP_1year = time_roc_res$FP[, 1],
TP_4year = time_roc_res$TP[, 2],
FP_4year = time_roc_res$FP[, 2],
TP_8year = time_roc_res$TP[, 3],
FP_8year = time_roc_res$FP[, 3]
)
ggplot(data = time_ROC_df) +
geom_line(aes(x = FP_1year, y = TP_1year), size = 1, color = "#0067B5") +
geom_line(aes(x = FP_4year, y = TP_4year), size = 1, color = "#09891D") +
geom_line(aes(x = FP_8year, y = TP_8year), size = 1, color = "#BC1328") +
geom_abline(slope = 1, intercept = 0, color = "grey", size = 1, linetype = 2) +
theme_bw() +
annotate("text",x = 0.75, y = 0.20, size = 4.5,label = paste0("AUC of 1-year survival = ", sprintf("%.3f", time_roc_res$AUC[[1]])), color = "#0067B5") +
annotate("text",x = 0.75, y = 0.15, size = 4.5,label = paste0("AUC of 4-year survival = ", sprintf("%.3f", time_roc_res$AUC[[2]])), color = "#09891D") +
annotate("text",x = 0.75, y = 0.10, size = 4.5,label = paste0("AUC of 8-year survival = ", sprintf("%.3f", time_roc_res$AUC[[3]])), color = "#BC1328") +
labs(x = "1-specificity", y = "Sensitivity") +
theme(axis.text = element_text(face = "bold", size = 11, color = "black"),
axis.title.x = element_text(face = "bold", size = 14, color = "black", margin = margin(c(15, 0, 0, 0))),
axis.title.y = element_text(face = "bold", size = 14, color = "black", margin = margin(c(0, 15, 0, 0))))
model_1 <- cph(surv_object ~ age + a_stage + tumor_size + grade + progesterone_status + estrogen_status + positive_ratio,
x = TRUE ,y = TRUE, surv = TRUE, time.inc = 1, data = surv_data)
cal_1 <- calibrate(model_1,
cmethod = 'KM',
method = 'boot',
u = 1,
B = 1000)
## Using Cox survival estimates at 1 Days
## Warning in groupkm(cox, Surv(y[, 1], y[, 2]), u = u, cuts = orig.cuts): one
## interval had < 2 observations
## Warning in groupkm(cox, Surv(y[, 1], y[, 2]), u = u, cuts = orig.cuts): one
## interval had < 2 observations
## Warning in groupkm(cox, Surv(y[, 1], y[, 2]), u = u, cuts = orig.cuts): one
## interval had < 2 observations
## Warning in groupkm(cox, Surv(y[, 1], y[, 2]), u = u, cuts = orig.cuts): one
## interval had < 2 observations
## Warning in groupkm(cox, Surv(y[, 1], y[, 2]), u = u, cuts = orig.cuts): one
## interval had < 2 observations
## Warning in groupkm(cox, Surv(y[, 1], y[, 2]), u = u, cuts = orig.cuts): one
## interval had < 2 observations
## Warning in groupkm(cox, Surv(y[, 1], y[, 2]), u = u, cuts = orig.cuts): one
## interval had < 2 observations
plot(cal_1,
lwd = 2,
lty = 1,
errbar.col="blue",
xlab="Nomogram-Predicted Probabilityof 1 years OS",
ylab="Actua1 1 years OS",
col="red",
subtitles=F
)
abline(0, 1, lty = 2, col = "gray")
model_2 <- cph(surv_object ~ age + a_stage + tumor_size + grade + progesterone_status + estrogen_status + positive_ratio,
x = TRUE ,y = TRUE, surv = TRUE, time.inc = 4, data = surv_data)
cal_2 <- calibrate(model_1,
cmethod = 'KM',
method = 'boot',
u = 4,
B = 1000)
## Using Cox survival estimates at 1 Days
plot(cal_2,
lwd = 2,
lty = 1,
errbar.col="blue",
xlab="Nomogram-Predicted Probabilityof 1 years OS",
ylab="Actua1 1 years OS",
col="red",
subtitles=F
)
abline(0, 1, lty = 2, col = "gray")
model_3 <- cph(surv_object ~ age + a_stage + tumor_size + grade + progesterone_status + estrogen_status + positive_ratio,
x = TRUE ,y = TRUE, surv = TRUE, time.inc = 8, data = surv_data)
cal_3 <- calibrate(model_1,
cmethod = 'KM',
method = 'boot',
u = 8,
B = 1000)
## Using Cox survival estimates at 1 Days
plot(cal_3,
lwd = 2,
lty = 1,
errbar.col="blue",
xlab="Nomogram-Predicted Probabilityof 1 years OS",
ylab="Actua1 1 years OS",
col="red",
subtitles=F
)
abline(0, 1, lty = 2, col = "gray")